#This file produces images for the paper
rm(list=ls())
setwd("...") # Fill in working directory
library(dplyr)
library(caret)      #wrapper model training
library(ranger)     #Random Forests
library(plyr)
library(glmnet)     #Lasso packages
library(glmnetUtils)
library(kableExtra) #Table 1


#Data Prep----
#Set Path1 to be the path to factor_data_no_criminal_history
#Set Path2 to be the path to one_hot_data_no_criminal_history
factor_data = read.csv(Path1, stringsAsFactors = TRUE)%>%
  select(-c("X"))

factor_data = factor_data %>%
  mutate(sp_ethn_aw = relevel(sp_ethn_aw, ref = "White and Other"),
         pcimgen_rev = relevel(pcimgen_rev, ref = "3rd generation"),
         marital_status = relevel(marital_status, ref = "Married"),
         arrested17_24 = relevel(arrested17_24, ref = "never_arrested"))

factor_train = factor_data%>%
  filter(older_cohort)%>%
  select(-c("older_cohort"))
factor_test = factor_data%>%
  filter(!older_cohort)%>%
  select(-c("older_cohort"))

one_hot_data = read.csv(file = Path2, stringsAsFactors = TRUE) %>%
  select(-c("X"))
one_hot_data = one_hot_data %>% 
  mutate(arrested17_24 = relevel(arrested17_24, ref = "never_arrested"))


train = one_hot_data%>%
  filter(older_cohort)%>%
  select(-c("older_cohort"))
test = one_hot_data%>%
  filter(!older_cohort)%>%
  select(-c("older_cohort"))

#Random Forest Model ---------
set.seed(42)
model = ranger(arrested17_24 ~., data=factor_train, probability = TRUE, respect.unordered.factors = TRUE, importance = "impurity", splitrule = "extratrees")

set.seed(42)
model_young = ranger(arrested17_24 ~., data=factor_test, probability = TRUE, respect.unordered.factors = TRUE, importance = "impurity", splitrule = "extratrees")


train_probs = model$predictions[,"arrested"]
test_probs = predict(model, type = "response", data = factor_test)$predictions[,"arrested"]
young_probs = model_young$predictions[,"arrested"]

factor_train$pnas_rf = train_probs
factor_test$pnas_rf = test_probs
factor_test$pnas_rf_yy = young_probs

#LASSO Model ---------
set.seed(42)
model = cv.glmnet(arrested17_24 ~ ., 
                  data = train, 
                  alpha = 1,
                  nfolds = 10,
                  keep = TRUE,
                  family = "binomial")

set.seed(42)
model_young = cv.glmnet(arrested17_24 ~ ., 
                        data = test, 
                        alpha = 1,
                        nfolds = 10,
                        keep = TRUE,
                        family = "binomial")


lambda_min = model$lambda.min
train_probs = (model$fit.preval[,model$lambda==lambda_min] %>%
                 plogis())
test_probs = (predict(model, type = "response", s = lambda_min, newdata = test)) %>%
  as.numeric()
lambda_young = model_young$lambda.min
young_probs =(model_young$fit.preval[,model_young$lambda==lambda_young] %>%
                plogis())

factor_train$pnas_lasso = train_probs
factor_test$pnas_lasso = test_probs
factor_test$pnas_lasso_yy = young_probs

#Ridge Model ---------
set.seed(42)
model = cv.glmnet(arrested17_24 ~ ., 
                  data = train, 
                  alpha = 0,
                  nfolds = 10,
                  keep = TRUE,
                  family = "binomial")

set.seed(42)
model_young = cv.glmnet(arrested17_24 ~ ., 
                        data = test, 
                        alpha = 0,
                        nfolds = 10,
                        keep = TRUE,
                        family = "binomial")


lambda_min = model$lambda.min
train_probs = (model$fit.preval[,model$lambda==lambda_min] %>%
                 plogis())
test_probs = (predict(model, type = "response", s = lambda_min, newdata = test)) %>%
  as.numeric()
lambda_young = model_young$lambda.min
young_probs =(model_young$fit.preval[,model_young$lambda==lambda_young] %>%
                plogis())

factor_train$pnas_ridge = train_probs
factor_test$pnas_ridge = test_probs
factor_test$pnas_ridge_yy = young_probs

#Logistic Regression Model ---------
train_control = trainControl(method = "cv", number = 10, savePredictions = "final", classProbs = TRUE)

set.seed(42)
model = train(arrested17_24 ~., 
              data = train,
              trControl = train_control,
              method = "glm",
              family = binomial(),
              maxit = 50)

set.seed(42)
model_young = train(arrested17_24 ~., 
                    data = test,
                    trControl = train_control,
                    method = "glm",
                    family = binomial(),
                    maxit = 50)


train_probs = model$pred[order(model$pred$rowIndex),]$arrested
test_probs = predict(model, newdata = test, type = "prob")$arrested
young_probs = model_young$pred[order(model_young$pred$rowIndex),]$arrested

factor_train$pnas_lr = train_probs
factor_test$pnas_lr = test_probs
factor_test$pnas_lr_yy = young_probs

#Coefficient z-test--------
coef_z_test = function(coef1, se1, coef2, se2){
  p = rep(0, length(coef1))
  for(i in 1:length(coef1)){
    dif = coef1[i] - coef2[i]
    z = (dif)/sqrt(se1[i]**2+se2[i]**2)
    if (dif>0){
      p[i] = 2*pnorm(z,0,1,lower.tail = FALSE)
    }
    else{
      p[i] = 2*pnorm(z,0,1,lower.tail = TRUE)
    }
  }
  return(p)
}

#Generate Table 1
coefs_young = summary(model_young)$coefficients %>% 
  data.frame() 
names(coefs_young) = c("coef_estimate", "std.error", "z.value", "p.value")
coefs_young = coefs_young%>%
  select(-c("z.value"))

all_coefs = summary(model)$coefficients %>% 
  data.frame() %>%
  cbind(coefs_young) %>%
  select(-c("z.value"))

names(all_coefs) = c("coef_estimate", "std.error","p.value", "coef_estimate_young", "std.error_young", "p.value_young")

all_coefs$p_dif = coef_z_test(all_coefs$coef_estimate, all_coefs$std.error, all_coefs$coef_estimate_young, all_coefs$std.error_young)

rownames(all_coefs) = c("Intercept", "Sex: Male", "Race/Ethnicity: Black", "Race/Ethnicity: Latino", "Caregiver Immigrant Generation: 2nd", "Caregiver Immigrant Generation: 1st", "Family Poverty", "Caregiver Marital Status: DSW","Caregiver Marital Status: Cohabitating", "Caregiver Marital Status: Single", "Low Self-Control" )

#consider grouping rows
coef_table = all_coefs%>%
  mutate(p.value = 
           ifelse(p.value<0.01, paste(as.character(round(p.value,3)),"***", sep=""), 
                  ifelse(p.value<0.05, paste(as.character(round(p.value,3)),"**", sep=""),
                         ifelse(p.value<0.1, paste(as.character(round(p.value,3)),"*", sep=""),
                                as.character(round(p.value,3))))),
         p.value_young = 
           ifelse(p.value_young<0.01, paste(as.character(round(p.value_young,3)),"***", sep=""), 
                  ifelse(p.value_young<0.05, paste(as.character(round(p.value_young,3)),"**", sep=""),
                         ifelse(p.value_young<0.1, paste(as.character(round(p.value_young,3)),"*", sep=""),
                                as.character(round(p.value_young,3))))),
         p_dif = 
           ifelse(p_dif<0.01, paste(as.character(round(p_dif,3)),"***", sep=""), 
                  ifelse(p_dif<0.05, paste(as.character(round(p_dif,3)),"**", sep=""),
                         ifelse(p_dif<0.1, paste(as.character(round(p_dif,3)),"*", sep=""),
                                as.character(round(p_dif,3)))))) %>%
  kable(digits = 3,
        col.names = c("Coefficient Estimate", "Standard Error","P-Value", "Coefficient Estimate", "Standard Error","P-Value", "P-Value"),
        escape = FALSE,
        align = 'lllllll') %>%
  kable_styling(bootstrap_options = c("condensed"),
                fixed_thead = TRUE,
                full_width = FALSE)%>%
  add_header_above(c(" " = 1, "Model Trained on Older Cohort" = 3, "Model Trained on Younger Cohort" = 3, "Coefficient Difference" = 1))

save_kable(coef_table, 'table1.png', bs_theme = "flatly")

#Export Results
write.csv(factor_train, "older_cohort_results.csv")
write.csv(factor_test, "younger_cohort_results.csv")